I sequenced these samples with Prime-seq on extracted RNAs already. Results were not that great. Here reprocess the same samples with TIRE-seq for a head to head comparison.
The processing went as planned. Full writeup available at ELN link
This was doone in the notebook scripts /S000553/downsample_humanTcell.sh
Prime-seq = MB2_PRM0003_YN sequenced on 2 runs:
* S000514 40M * S000493 149.8M * Combined = 189.8M / 192 samples = 988k
reads per well * The object I import was generated in
G000233_DRAC_notebooks/MB2_Tcell/1A_generate_SCE.rmd
TIRE-Seq
This is generated in 1A notebook generate SCE.
tire <- readRDS(here::here(
"data/TIRE_Tcell/SCEs/Downsample/tire_tcell_ds_basic.sce.rds"
))
prime <- readRDS(here::here(
"data/TIRE_Tcell/SCEs/Downsample/prime_tcell_ds_basic.sce.rds"
))Extract the sample metadata and unify before merging into a
tibble.
Keep only the untransduced normal T cells in the Prime-seq
experiment.
All metrics are better for TIRE-seq
Better for TIRE-seq
plt1 <- ggplot(data=tb, aes(y=sum+1, x=Reads+1, colour=Protocol)) +
geom_point() +
scale_colour_brewer(type="qualitative", palette = "Dark2") +
xlab("Reads") + ylab("Library size (UMIs)") +
scale_y_continuous(trans='log10', limits = c(1, 1e7)) +
scale_x_continuous(trans='log10', limits = c(1, 1e7)) +
annotation_logticks(base = 10, sides = "bl") +
coord_fixed(ratio = 1)
plt1This show the activation of transcription at day 1 and 2
tb <- tb %>%
mutate(Day = as.numeric(str_split(pattern = "_", Timepoint, simplify = TRUE)[,2]))
plt5 <- ggplot(data = tb, aes(y = sum + 1, x = Day, colour = Protocol)) +
geom_point() +
geom_smooth(se = TRUE, method = "loess", span = 0.6) + # Adjust `span` for better fit
scale_colour_brewer(type = "qualitative", palette = "Dark2") +
xlab("Day") +
ylab("Library size (UMIs)") +
scale_y_continuous(trans = 'log10', limits = c(1e3, 1e7)) +
annotation_logticks(base = 10, sides = "l")
plt5Better for TIRE-seq.
mapping <- tb %>%
select(Intergenic:Reads, Protocol,Well) %>%
pivot_longer(cols = c(Intergenic, Exon, Ambiguity, Unmapped),
names_to = "Feature",
values_to = "Count") %>%
mutate(Percentage = Count / Reads * 100)
plt2 <- ggplot(mapping,
aes(x = Feature, y= Percentage, colour = Protocol)) +
geom_boxplot() +
ylab("Percent") + ylim(0,100) +
xlab("") +
scale_colour_brewer(palette = "Dark2")
plt2rowData(tire)$sum <- rowSums2(counts(tire))
rowData(tire)$protocol <- "TIRE"
rowData(prime)$sum <- rowSums2(counts(prime))
rowData(prime)$protocol <- "Prime"
gene_tb <- as_tibble(rbind(
rowData(tire),
rowData(prime)
))
# Add 1 before logging
gene_tb$sum <- gene_tb$sum + 1Convert to wide tibble
gene_tb_wide <- gene_tb %>%
select(ID,Symbol,sum,protocol) %>%
pivot_wider(names_from = protocol, values_from = sum)
gene_tb_widegene_tb_wide_clean <- na.omit(gene_tb_wide)
correlation <- cor(log10(gene_tb_wide_clean$Prime), log10(gene_tb_wide_clean$TIRE), method = "pearson")The correlation is 0.8866417
plt4 <- ggplot(gene_tb_wide_clean,
aes(x = Prime, y= TIRE)) +
geom_point(alpha = 0.33, size=1) +
guides(colour = guide_legend(override.aes = list(size=3, alpha=1))) +
xlab("Prime-seq") +
ylab("TIRE-seq") +
ggtitle("Counts by gene") +
geom_smooth(method = "lm", formula = y ~ x, color = "red",
se = TRUE, level = 0.95) +
scale_y_continuous(trans='log10', limits = c(1, 1e6)) +
scale_x_continuous(trans='log10', limits = c(1, 1e6)) +
annotation_logticks(base = 10, sides = "bl") +
theme_Publication(base_size = 18)
plt4Check what genes are different between Prime and TIRE-seq particularly some outliers detected in Prime-seq.
First need to combine the objects.
keep_genes <- intersect(
row.names(tire),
rownames(prime)
)
counts <- cbind(
counts(tire[keep_genes]),
counts(prime[keep_genes])
)
dge <- DGEList(counts = counts,
group = tb$Protocol)Perform preprocessing steps
keep <- filterByExpr(dge, group=dge$samples$Timepoint, min.count=1)
dge <- dge[keep,]
dge <- calcNormFactors(dge)
design <- model.matrix(~group, data=dge$samples)
dge <- estimateDisp(dge, design)
fit <- glmQLFit(dge, design)Perform the DE test
qlf <- glmQLFTest(fit, coef = 2)
results <- as_tibble(topTags(qlf, n = Inf)$table)
results$Symbol <- row.names(topTags(qlf, n = Inf)$table)
sig_genes <- results %>%
filter(FDR < 0.05, abs(logFC) > 1)
sig_genesCreate the MA plot
ggplot(results, aes(x = logCPM, y = logFC)) +
geom_point(aes(color = FDR < 0.05), alpha = 0.3, size=0.75) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
scale_color_manual(values = c("grey", "darkblue")) +
labs(
x = "Log2 CPM",
y = "Log2 FC",
color = "Significant"
) +
theme_Publication()I generated this with ChaptGPT io
I could not use edgeR BCV function bacause Prime-seq had too few counts to pass the filter by expression step.
# Start with unfiltered dgelist object
dge <- DGEList(counts = counts,
group = tb$Protocol)
# Calculate normalized counts (log-transformed counts per million)
logCPM <- edgeR::cpm(dge, log=TRUE, normalized.lib.sizes=TRUE)
# Convert to a data frame
logCPM_df <- as.data.frame(logCPM)
logCPM_df$Gene <- rownames(logCPM_df)
# Reshape the data to long format
library(reshape2)
logCPM_long <- melt(logCPM_df, id.vars = "Gene", variable.name = "Sample", value.name = "Expression")
# Add group information to the data
logCPM_long$Group <- dge$samples$group[match(logCPM_long$Sample, rownames(dge$samples))]
# Calculate mean expression per gene within each group
gene_stats <- logCPM_long %>%
group_by(Gene, Group) %>%
summarize(
Variance = var(Expression, na.rm = TRUE),
MeanExpression = mean(Expression, na.rm = TRUE)
)The shape here is typical of unfiltered counts which
is what I supplied to the plot.
If I filter counts I lose too many genes from Prime-seq.
ggplot(gene_stats, aes(x=MeanExpression, y=Variance, color=Group)) +
geom_point(alpha=0.5) +
theme_Publication() +
scale_colour_brewer(palette = "Dark2") +
xlab("Mean Genewise log-CPM") + ylab("Variance") + labs(color="Protocol")TIRE-seq better in all metrics
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.4 (Plow)
##
## Matrix products: default
## BLAS: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] reshape2_1.4.4 ggthemes_5.1.0
## [3] here_1.0.1 scater_1.32.1
## [5] scuttle_1.14.0 SingleCellExperiment_1.26.0
## [7] SummarizedExperiment_1.34.0 Biobase_2.64.0
## [9] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
## [11] IRanges_2.38.1 S4Vectors_0.42.1
## [13] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
## [15] matrixStats_1.4.1 edgeR_4.2.2
## [17] limma_3.60.6 patchwork_1.3.0
## [19] platetools_0.1.7 lubridate_1.9.3
## [21] forcats_1.0.0 stringr_1.5.1
## [23] dplyr_1.1.4 purrr_1.0.2
## [25] readr_2.1.5 tidyr_1.3.1
## [27] tibble_3.2.1 ggplot2_3.5.1
## [29] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gridExtra_2.3 rlang_1.1.4
## [3] magrittr_2.0.3 compiler_4.4.1
## [5] mgcv_1.9-1 DelayedMatrixStats_1.26.0
## [7] vctrs_0.6.5 pkgconfig_2.0.3
## [9] crayon_1.5.3 fastmap_1.2.0
## [11] XVector_0.44.0 labeling_0.4.3
## [13] utf8_1.2.4 rmarkdown_2.28
## [15] tzdb_0.4.0 UCSC.utils_1.0.0
## [17] ggbeeswarm_0.7.2 xfun_0.48
## [19] zlibbioc_1.50.0 cachem_1.1.0
## [21] beachmat_2.20.0 jsonlite_1.8.9
## [23] highr_0.11 DelayedArray_0.30.1
## [25] BiocParallel_1.38.0 irlba_2.3.5.1
## [27] parallel_4.4.1 R6_2.5.1
## [29] bslib_0.8.0 stringi_1.8.4
## [31] RColorBrewer_1.1-3 jquerylib_0.1.4
## [33] Rcpp_1.0.13 knitr_1.48
## [35] splines_4.4.1 Matrix_1.7-0
## [37] timechange_0.3.0 tidyselect_1.2.1
## [39] rstudioapi_0.17.0 abind_1.4-8
## [41] yaml_2.3.10 viridis_0.6.5
## [43] codetools_0.2-20 plyr_1.8.9
## [45] lattice_0.22-6 withr_3.0.1
## [47] evaluate_1.0.1 pillar_1.9.0
## [49] generics_0.1.3 rprojroot_2.0.4
## [51] hms_1.1.3 sparseMatrixStats_1.16.0
## [53] munsell_0.5.1 scales_1.3.0
## [55] glue_1.8.0 tools_4.4.1
## [57] BiocNeighbors_1.22.0 ScaledMatrix_1.12.0
## [59] locfit_1.5-9.10 colorspace_2.1-1
## [61] nlme_3.1-164 GenomeInfoDbData_1.2.12
## [63] beeswarm_0.4.0 BiocSingular_1.20.0
## [65] vipor_0.4.7 cli_3.6.3
## [67] rsvd_1.0.5 fansi_1.0.6
## [69] S4Arrays_1.4.1 viridisLite_0.4.2
## [71] gtable_0.3.5 sass_0.4.9
## [73] digest_0.6.37 SparseArray_1.4.8
## [75] ggrepel_0.9.6 farver_2.1.2
## [77] htmltools_0.5.8.1 lifecycle_1.0.4
## [79] httr_1.4.7 statmod_1.5.0